##
## Friday October 23rd
## Day 3: “Data Exploration, Part 2: Basic RNA-seq Plots”
##
# before session:
# have 'ggplot2', 'pheatmap', 'RColorBrewer', 'reshape2' installed
# code:
# install.packages("ggplot2")
## Step 1: call ("activate") libraries (packages) we are going to use
library(ggplot2)
## Step 2: set working directory to source file (this script) location
getwd()
## [1] "/Users/florschlamp/Desktop/Day 3"
# Session > Set Working Directory > To Source File Location
## Step 3: read in data files
# counts and metadata
load("StartData.Rdata")
head(raw_counts_filt)
## C1 T1 C2 T2 C3 T3 C4 T4
## ENSG00000000003 723 486 904 445 1170 1097 806 604
## ENSG00000000419 467 523 616 371 582 781 417 509
## ENSG00000000457 347 258 364 237 318 447 330 324
## ENSG00000000460 96 81 73 66 118 94 102 74
## ENSG00000000938 0 0 1 0 2 0 0 0
## ENSG00000000971 3413 3916 6000 4308 6424 10723 5039 7803
sample_table
## id dex celltype sample_name
## 1 SRR1039508 control N61311 C1
## 2 SRR1039509 treated N61311 T1
## 3 SRR1039512 control N052611 C2
## 4 SRR1039513 treated N052611 T2
## 5 SRR1039516 control N080611 C3
## 6 SRR1039517 treated N080611 T3
## 7 SRR1039520 control N061011 C4
## 8 SRR1039521 treated N061011 T4
## Step 4: normalize counts by library size and log scale
lib_sizes=colSums(raw_counts_filt)
size.factor=lib_sizes/exp(mean(log(lib_sizes)))
norm_counts=t(t(raw_counts_filt)/size.factor)
head(norm_counts)
## C1 T1 C2 T2 C3
## ENSG00000000003 757.0258 554.9712 767.9000260 628.24187 1031.494629
## ENSG00000000419 488.9780 597.2221 523.2593097 523.77019 513.102457
## ENSG00000000457 363.3305 294.6144 309.1986830 334.59174 280.354950
## ENSG00000000460 100.5180 92.4952 62.0096260 93.17745 104.031082
## ENSG00000000938 0.0000 0.0000 0.8494469 0.00000 1.763239
## ENSG00000000971 3573.6226 4471.7434 5096.6815883 6081.94603 5663.522647
## T3 C4 T4
## ENSG00000000003 763.20953 906.0354 610.94038
## ENSG00000000419 543.36066 468.7553 514.84876
## ENSG00000000457 310.98875 370.9574 327.72298
## ENSG00000000460 65.39808 114.6596 74.85031
## ENSG00000000938 0.00000 0.0000 0.00000
## ENSG00000000971 7460.25140 5664.4075 7892.66184
norm_log_counts = log2(norm_counts+1)
head(norm_log_counts)
## C1 T1 C2 T2 C3 T3
## ENSG00000000003 9.566103 9.118866 9.5866522 9.297471 10.011919 9.577824
## ENSG00000000419 8.936573 9.224537 9.0341368 9.035542 9.005912 9.088419
## ENSG00000000457 8.509104 8.207573 8.2770488 8.390563 8.136248 8.285350
## ENSG00000000460 6.665591 6.546820 5.9775003 6.557310 6.714673 6.053070
## ENSG00000000938 0.000000 0.000000 0.8870939 0.000000 1.466360 0.000000
## ENSG00000000971 11.803575 12.126944 12.3156255 12.570554 12.467739 12.865202
## C4 T4
## ENSG00000000003 9.825015 9.257247
## ENSG00000000419 8.875766 9.010804
## ENSG00000000457 8.538994 8.360729
## ENSG00000000460 6.853741 6.245083
## ENSG00000000938 0.000000 0.000000
## ENSG00000000971 12.467964 12.946479
# Plot 1) comparing replicates
# pairwise scatterplot
norm_log_counts <- data.frame(norm_log_counts)
## base plot
# basic
plot(norm_log_counts$C1,norm_log_counts$C2)

## ggplot
# basic
ggplot(norm_log_counts, aes(x=C1, y=C2)) +
geom_point()

# add line
ggplot(norm_log_counts, aes(x=C1, y=C2)) +
geom_point() + geom_abline(slope=1, color="red")

# add better labels
ggplot(norm_log_counts, aes(x=C1, y=C2)) +
geom_point() + geom_abline(slope=1, color="red") +
labs(title="pairwise scatterplot between samples C1 and C2",
x="norm counts for sample C1",
y="norm counts for sample C2")

# themes
ggplot(norm_log_counts, aes(x=C1, y=C2)) +
geom_point() + theme_bw()

ggplot(norm_log_counts, aes(x=C1, y=C2)) +
geom_point() + theme_classic()

ggplot(norm_log_counts, aes(x=C1, y=C2)) +
geom_point() + theme_grey() # default?

ggplot(norm_log_counts, aes(x=C1, y=C2)) +
geom_point() + theme_minimal()

# manipulate points
ggplot(norm_log_counts, aes(x=C1, y=C2)) +
geom_point(alpha=0.25)

ggplot(norm_log_counts, aes(x=C1, y=C2)) +
geom_point(alpha=0.25, color="blue")

# filtered counts
norm_log_counts_filt <- read.csv("normalized_counts_after_log_and_filtering.csv", row.names = 1)
ggplot(norm_log_counts_filt, aes(x=C1, y=C2)) +
geom_point(alpha=0.25, color="blue")

# let's add some stats
rsq <- cor(norm_log_counts_filt$C1,norm_log_counts_filt$C2) ^ 2
rsq
## [1] 0.8649412
ggplot(norm_log_counts_filt, aes(x=C1, y=C2)) +
geom_point(alpha=0.25) + geom_abline(slope=1, color="red") +
labs(title="pairwise scatterplot between samples C1 and C2",
subtitle="R squared = 0.8649412",
x="norm counts for sample C1",
y="norm counts for sample C2")

ggplot(norm_log_counts_filt, aes(x=C1, y=C2)) +
geom_point(alpha=0.25) + geom_abline(slope=1, color="red") +
labs(title="pairwise scatterplot between samples C1 and C2",
subtitle=paste0("R squared = ",rsq),
x="norm counts for sample C1",
y="norm counts for sample C2")

round(rsq,digits=3)
## [1] 0.865
# final form:
ggplot(norm_log_counts_filt, aes(x=C1, y=C2)) +
geom_point(alpha=0.25) + geom_abline(slope=1, color="red") +
labs(title="pairwise scatterplot between samples C1 and C2",
subtitle=paste0("R squared = ",round(rsq,digits=3)),
x="norm counts for sample C1",
y="norm counts for sample C2")

## Any questions so far?
cor(norm_log_counts_filt$C1,norm_log_counts_filt$C2) ^ 2
## [1] 0.8649412
cor(norm_log_counts_filt$C1,norm_log_counts_filt$T1) ^ 2
## [1] 0.913021
# Plot 2) Comparing multiple samples
# heatmap of sample distances
library(pheatmap)
# calculate distances
sampleDists <- dist(t(norm_log_counts_filt))
sampleDistMatrix <- as.matrix(sampleDists)
# simple heatmap
pheatmap(sampleDistMatrix)

# change colors manually
library(RColorBrewer)
## for more colors: https://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(50)
pheatmap(sampleDistMatrix,
col=colors)

# add group labels
sample_table
## id dex celltype sample_name
## 1 SRR1039508 control N61311 C1
## 2 SRR1039509 treated N61311 T1
## 3 SRR1039512 control N052611 C2
## 4 SRR1039513 treated N052611 T2
## 5 SRR1039516 control N080611 C3
## 6 SRR1039517 treated N080611 T3
## 7 SRR1039520 control N061011 C4
## 8 SRR1039521 treated N061011 T4
rownames(sampleDistMatrix)
## [1] "C1" "T1" "C2" "T2" "C3" "T3" "C4" "T4"
annotation <- data.frame(row.names = sample_table$sample_name, #HAS to be the same as matrix
cell_type = sample_table$celltype,
treatment = sample_table$dex)
annotation
## cell_type treatment
## C1 N61311 control
## T1 N61311 treated
## C2 N052611 control
## T2 N052611 treated
## C3 N080611 control
## T3 N080611 treated
## C4 N061011 control
## T4 N061011 treated
pheatmap(sampleDistMatrix,
col=colors,
annotation_col = annotation)

## show examples from other datasets
# Plot 3) Plotting genes of interest
# A) plot individual genes
gene_of_interest <- "ENSG00000179094"
norm_counts_subset <- norm_log_counts[gene_of_interest,]
norm_counts_subset
## C1 T1 C2 T2 C3 T3 C4
## ENSG00000179094 7.475356 10.63531 7.273795 10.25692 7.846876 9.997636 7.516538
## T4
## ENSG00000179094 10.33049
data_to_plot <- data.frame(row.names = sample_table$sample_name,
cell_type = sample_table$celltype,
treatment = sample_table$dex,
norm_counts = as.numeric(norm_counts_subset))
data_to_plot
## cell_type treatment norm_counts
## C1 N61311 control 7.475356
## T1 N61311 treated 10.635307
## C2 N052611 control 7.273795
## T2 N052611 treated 10.256918
## C3 N080611 control 7.846876
## T3 N080611 treated 9.997636
## C4 N061011 control 7.516538
## T4 N061011 treated 10.330487
# basic plot
ggplot(data_to_plot, aes(x=treatment, y=norm_counts)) +
geom_point()

ggplot(data_to_plot, aes(x=cell_type, y=norm_counts)) +
geom_point()

ggplot(data_to_plot, aes(x=treatment, y=norm_counts)) +
geom_point(size=3,aes(color=cell_type))

# add lines
ggplot(data_to_plot, aes(x=treatment, y=norm_counts, group=cell_type)) +
geom_point(size=3,aes(color=cell_type)) +
geom_line()

ggplot(data_to_plot, aes(x=treatment, y=norm_counts, group=cell_type)) +
geom_point(size=3,aes(color=cell_type)) +
geom_line(aes(color=cell_type))

# boxplots instead
ggplot(data_to_plot, aes(x=treatment, y=norm_counts)) +
geom_boxplot()

ggplot(data_to_plot, aes(x=treatment, y=norm_counts)) +
geom_boxplot(aes(fill=treatment))

# add points
ggplot(data_to_plot, aes(x=treatment, y=norm_counts)) +
geom_boxplot() + geom_point()

# add labels
ggplot(data_to_plot, aes(x=treatment, y=norm_counts)) +
geom_boxplot(aes(fill=treatment)) + geom_point() +
labs(title=gene_of_interest)

# B) plot multiple genes
genes_of_interest <- c("ENSG00000152583",
"ENSG00000179094",
"ENSG00000116584",
"ENSG00000189221",
"ENSG00000120129",
"ENSG00000148175",
"ENSG00000178695",
"ENSG00000109906",
"ENSG00000134686",
"ENSG00000101347")
norm_counts_subset <- norm_log_counts[genes_of_interest,]
norm_counts_subset
## C1 T1 C2 T2 C3 T3
## ENSG00000152583 5.923209 11.089208 6.832592 10.645621 6.677878 10.367184
## ENSG00000179094 7.475356 10.635307 7.273795 10.256918 7.846876 9.997636
## ENSG00000116584 11.674119 10.566361 11.603869 10.423378 11.648400 10.508935
## ENSG00000189221 9.017501 12.186827 8.823067 12.059141 9.064100 12.135343
## ENSG00000120129 9.467130 12.515039 9.998428 12.690950 9.460522 12.090248
## ENSG00000148175 12.840301 14.322178 12.860022 14.001677 12.922685 14.191447
## ENSG00000178695 12.316632 9.941118 11.999167 9.254766 12.184040 9.915834
## ENSG00000109906 3.761790 9.875987 3.780489 9.479676 2.842242 8.804000
## ENSG00000134686 10.785697 12.012384 10.544461 12.002685 10.839301 12.002460
## ENSG00000101347 10.868827 14.424318 10.923317 14.938754 10.635319 13.704838
## C4 T4
## ENSG00000152583 6.470551 10.99161
## ENSG00000179094 7.516538 10.33049
## ENSG00000116584 11.584999 10.54297
## ENSG00000189221 8.062936 11.78962
## ENSG00000120129 9.617708 12.76383
## ENSG00000148175 12.910653 14.37836
## ENSG00000178695 12.296746 9.28792
## ENSG00000109906 4.863088 10.32481
## ENSG00000134686 10.504055 12.10096
## ENSG00000101347 11.085331 15.13546
# flip the table
t(norm_counts_subset)
## ENSG00000152583 ENSG00000179094 ENSG00000116584 ENSG00000189221
## C1 5.923209 7.475356 11.67412 9.017501
## T1 11.089208 10.635307 10.56636 12.186827
## C2 6.832592 7.273795 11.60387 8.823067
## T2 10.645621 10.256918 10.42338 12.059141
## C3 6.677878 7.846876 11.64840 9.064100
## T3 10.367184 9.997636 10.50893 12.135343
## C4 6.470551 7.516538 11.58500 8.062936
## T4 10.991606 10.330487 10.54297 11.789618
## ENSG00000120129 ENSG00000148175 ENSG00000178695 ENSG00000109906
## C1 9.467130 12.84030 12.316632 3.761790
## T1 12.515039 14.32218 9.941118 9.875987
## C2 9.998428 12.86002 11.999167 3.780489
## T2 12.690950 14.00168 9.254766 9.479676
## C3 9.460522 12.92268 12.184040 2.842242
## T3 12.090248 14.19145 9.915834 8.804000
## C4 9.617708 12.91065 12.296746 4.863088
## T4 12.763834 14.37836 9.287920 10.324809
## ENSG00000134686 ENSG00000101347
## C1 10.78570 10.86883
## T1 12.01238 14.42432
## C2 10.54446 10.92332
## T2 12.00269 14.93875
## C3 10.83930 10.63532
## T3 12.00246 13.70484
## C4 10.50405 11.08533
## T4 12.10096 15.13546
sample_table
## id dex celltype sample_name
## 1 SRR1039508 control N61311 C1
## 2 SRR1039509 treated N61311 T1
## 3 SRR1039512 control N052611 C2
## 4 SRR1039513 treated N052611 T2
## 5 SRR1039516 control N080611 C3
## 6 SRR1039517 treated N080611 T3
## 7 SRR1039520 control N061011 C4
## 8 SRR1039521 treated N061011 T4
metadata_subset <- data.frame(row.names = sample_table$sample_name,
cell_type = sample_table$celltype,
treatment = sample_table$dex)
metadata_subset
## cell_type treatment
## C1 N61311 control
## T1 N61311 treated
## C2 N052611 control
## T2 N052611 treated
## C3 N080611 control
## T3 N080611 treated
## C4 N061011 control
## T4 N061011 treated
data_to_plot <- cbind(metadata_subset,t(norm_counts_subset))
data_to_plot
## cell_type treatment ENSG00000152583 ENSG00000179094 ENSG00000116584
## C1 N61311 control 5.923209 7.475356 11.67412
## T1 N61311 treated 11.089208 10.635307 10.56636
## C2 N052611 control 6.832592 7.273795 11.60387
## T2 N052611 treated 10.645621 10.256918 10.42338
## C3 N080611 control 6.677878 7.846876 11.64840
## T3 N080611 treated 10.367184 9.997636 10.50893
## C4 N061011 control 6.470551 7.516538 11.58500
## T4 N061011 treated 10.991606 10.330487 10.54297
## ENSG00000189221 ENSG00000120129 ENSG00000148175 ENSG00000178695
## C1 9.017501 9.467130 12.84030 12.316632
## T1 12.186827 12.515039 14.32218 9.941118
## C2 8.823067 9.998428 12.86002 11.999167
## T2 12.059141 12.690950 14.00168 9.254766
## C3 9.064100 9.460522 12.92268 12.184040
## T3 12.135343 12.090248 14.19145 9.915834
## C4 8.062936 9.617708 12.91065 12.296746
## T4 11.789618 12.763834 14.37836 9.287920
## ENSG00000109906 ENSG00000134686 ENSG00000101347
## C1 3.761790 10.78570 10.86883
## T1 9.875987 12.01238 14.42432
## C2 3.780489 10.54446 10.92332
## T2 9.479676 12.00269 14.93875
## C3 2.842242 10.83930 10.63532
## T3 8.804000 12.00246 13.70484
## C4 4.863088 10.50405 11.08533
## T4 10.324809 12.10096 15.13546
# 'melt' the table
library(reshape2)
data_to_plot_melted <- melt(data_to_plot)
## Using cell_type, treatment as id variables
head(data_to_plot_melted)
## cell_type treatment variable value
## 1 N61311 control ENSG00000152583 5.923209
## 2 N61311 treated ENSG00000152583 11.089208
## 3 N052611 control ENSG00000152583 6.832592
## 4 N052611 treated ENSG00000152583 10.645621
## 5 N080611 control ENSG00000152583 6.677878
## 6 N080611 treated ENSG00000152583 10.367184
colnames(data_to_plot_melted) <- c("cell_type","treatment","gene","norm_counts")
head(data_to_plot_melted)
## cell_type treatment gene norm_counts
## 1 N61311 control ENSG00000152583 5.923209
## 2 N61311 treated ENSG00000152583 11.089208
## 3 N052611 control ENSG00000152583 6.832592
## 4 N052611 treated ENSG00000152583 10.645621
## 5 N080611 control ENSG00000152583 6.677878
## 6 N080611 treated ENSG00000152583 10.367184
# plot by gene
ggplot(data_to_plot_melted, aes(x=gene, y=norm_counts)) +
geom_point()

ggplot(data_to_plot_melted, aes(x=gene, y=norm_counts)) +
geom_point() +
theme(axis.text.x = element_text(angle = 90))

ggplot(data_to_plot_melted, aes(x=gene, y=norm_counts)) +
geom_point(aes(color=treatment)) +
theme(axis.text.x = element_text(angle = 90))

# plot by treatment
ggplot(data_to_plot_melted, aes(x=treatment, y=norm_counts)) +
geom_point(aes(color=gene))

ggplot(data_to_plot_melted, aes(x=treatment, y=norm_counts)) +
geom_point(aes(color=gene)) +
stat_summary(fun = mean, geom="line", mapping = aes(group=gene, color=gene))

# final form:
ggplot(data_to_plot_melted, aes(x=treatment, y=norm_counts)) +
geom_point(aes(color=gene)) +
stat_summary(fun = mean, geom="line", mapping = aes(group=gene, color=gene)) +
labs(title="normalized counts of top 10 most variable genes",
subtitle="grouped by treatment",
y="log transformed normalized counts",
x="dex status")

# adding more stats:
# smaller subset (10 genes is too much)
genes_of_interest <- c("ENSG00000152583",
"ENSG00000179094",
"ENSG00000116584")
norm_counts_subset <- norm_log_counts[genes_of_interest,]
rownames(norm_counts_subset) <- c("SPARCL1","PER1","ARHGEF2")
metadata_subset <- data.frame(row.names = sample_table$sample_name,
cell_type = sample_table$celltype,
treatment = sample_table$dex)
data_to_plot <- cbind(metadata_subset,t(norm_counts_subset))
data_to_plot_melted <- melt(data_to_plot)
## Using cell_type, treatment as id variables
colnames(data_to_plot_melted) <- c("cell_type","treatment","gene","norm_counts")
head(data_to_plot_melted)
## cell_type treatment gene norm_counts
## 1 N61311 control SPARCL1 5.923209
## 2 N61311 treated SPARCL1 11.089208
## 3 N052611 control SPARCL1 6.832592
## 4 N052611 treated SPARCL1 10.645621
## 5 N080611 control SPARCL1 6.677878
## 6 N080611 treated SPARCL1 10.367184
# plot by gene
ggplot(data_to_plot_melted, aes(x=gene, y=norm_counts)) +
geom_point(aes(color=treatment))

# boxplot?
ggplot(data_to_plot_melted, aes(x=gene, y=norm_counts)) +
geom_boxplot(aes(fill=treatment))

# points might be better
ggplot(data_to_plot_melted, aes(x=gene, y=norm_counts)) +
geom_point(aes(color=treatment)) +
stat_summary(fun.min=function(x)(mean(x)-sd(x)),
fun.max=function(x)(mean(x)+sd(x)),
geom="errorbar", aes(group=treatment),
width=0.1, position=position_dodge(.4)) +
stat_summary(fun=mean, geom="point", shape=23, color="black", aes(fill=treatment),
size=2,position=position_dodge(.4))

## Homework for next session:
# install the DESeq2 package (from Bioconductor)